library(tidyverse)
library(tidytext)
library(stringr)
library(tm)
library(topicmodels)
set.seed(1234)
theme_set(theme_minimal())
Import data
(friends <- read_csv("data/friends-tidytext.csv"))
Parsed with column specification:
cols(
id = col_integer(),
season = col_integer(),
episode = col_integer(),
ngram = col_integer(),
scene_num = col_integer(),
character = col_character(),
word = col_character()
)
(friends_core <- friends %>%
filter(character %in% c("Rachel", "Monica", "Phoebe", "Chandler", "Joey", "Ross")))
Most common tokens
library(wordcloud2)
library(magrittr)
friends %>%
count(word, sort = TRUE)
friends %>%
count(word, sort = TRUE) %>%
rename(freq = n) %T>%
print %>%
wordcloud2
Most predominant tokens per major character
# get tf-idf per character
(friends_tfidf <- friends_core %>%
count(character, ngram, word) %>%
bind_tf_idf(term = word, document = character, n = n))
# sort the data frame and convert word to a factor column
friends_tfidf <- friends_tfidf %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word))))
# graph the top 10 tokens for each character
friends_tfidf %>%
group_by(character) %>%
top_n(20) %>%
ungroup() %>%
ggplot(aes(word, tf_idf)) +
geom_col() +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~ character, scales = "free") +
coord_flip()
# separate graphs for each ngram
plot_friends_tfidf <- function(friends_tfidf, n_gram, n_top = 10){
friends_tfidf %>%
filter(ngram == n_gram) %>%
group_by(character) %>%
top_n(n_top) %>%
ungroup() %>%
ggplot(aes(word, tf_idf)) +
geom_col() +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~ character, scales = "free") +
coord_flip()
}
seq(from = 1, to = 5) %>%
map(~ plot_friends_tfidf(friends_tfidf, n_gram = .x))
Word co-occurrences and correlations
friends_stop_words
friends_pairs <- friends %>%
filter(ngram == 1) %>%
mutate(id_scene = str_c(season, episode, scene_num, sep = "_")) %>%
pairwise_count(item = word, feature = id_scene, sort = TRUE, upper = FALSE)
friends_pairs
library(igraph)
library(ggraph)
set.seed(1234)
friends_pairs %>%
filter(n >= 250) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "cyan4") +
geom_node_point(size = 5) +
geom_node_text(aes(label = name), repel = TRUE,
point.padding = unit(0.2, "lines")) +
theme_void()
Estimate LDA model
Scene-level model
Create document-term matrix
# remove non-informative terms
## get token count-per-scene
friends_count <- friends %>%
mutate(id = str_c(season, episode, scene_num, sep = "_")) %>%
count(id, season, episode, scene_num, word)
## total number of scenes in corpus
n_scene <- n_distinct(friends_count$id)
## only keep tokens that appear in greater than 1% but fewer than
## 25% of scenes. determined via visual inspection of distribution
friends_keep <- friends_count %>%
count(word) %>%
mutate(pct = nn / n_scene) %>%
filter(pct <= .25,
pct >= .01)
## create the document-term matrix
(friends_dtm <- friends_count %>%
semi_join(friends_keep) %>%
cast_dtm(id, word, n))
Pick k
library("ldatuning")
result <- FindTopicsNumber(
friends_dtm,
topics = seq(from = 5, to = 50, by = 3),
metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
control = list(seed = 1234),
mc.cores = 6L,
verbose = TRUE
)
FindTopicsNumber_plot(result)
K = 50
lda_50 <- LDA(friends_dtm, k = 50, control = list(seed = 1234))
lda_50
library(LDAvis)
library(slam)
json <- createJSON(phi = posterior(lda_50)$terms,
theta = posterior(lda_50)$topics,
doc.length = row_sums(friends_dtm),
vocab = colnames(friends_dtm),
term.frequency = col_sums(friends_dtm))
serVis(json)
Episode-level model
Create document-term matrix
# remove non-informative terms
## get token count-per-scene
friends_count_ep <- friends %>%
mutate(id = str_c(season, episode, sep = "_")) %>%
count(id, season, episode, word)
## total number of scenes in corpus
n_ep <- n_distinct(friends_count_ep$id)
## only keep tokens that appear in greater than 1% but fewer than
## 25% of scenes. determined via visual inspection of distribution
friends_keep_ep <- friends_count_ep %>%
count(word) %>%
mutate(pct = nn / n_ep) %>%
filter(pct <= .93,
pct >= .05)
## create the document-term matrix
(friends_ep_dtm <- friends_count_ep %>%
semi_join(friends_keep_ep) %>%
cast_dtm(id, word, n))
Pick k
library("ldatuning")
result_ep <- FindTopicsNumber(
friends_ep_dtm,
topics = seq(from = 5, to = 50, by = 3),
metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
control = list(seed = 1234),
mc.cores = 6L,
verbose = TRUE
)
FindTopicsNumber_plot(result_ep)
K = 50
lda_50_ep <- LDA(friends_ep_dtm, k = 50, control = list(seed = 1234))
lda_50_ep
json_ep <- createJSON(phi = posterior(lda_50_ep)$terms,
theta = posterior(lda_50_ep)$topics,
doc.length = row_sums(friends_ep_dtm),
vocab = colnames(friends_ep_dtm),
term.frequency = col_sums(friends_ep_dtm))
serVis(json_ep)
LS0tCnRpdGxlOiAiRURBIG9uIEZyaWVuZHMgdHJhbnNjcmlwdHMiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCmBgYHtyIHNldHVwLCBpbmNsdWRlID0gRkFMU0V9CmtuaXRyOjpvcHRzX2NodW5rJHNldChjYWNoZSA9IFRSVUUpCmBgYAoKYGBge3IgcGFja2FnZXMsIGNhY2hlID0gRkFMU0UsIG1lc3NhZ2UgPSBGQUxTRSwgd2FybmluZyA9IEZBTFNFfQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeSh0aWR5dGV4dCkKbGlicmFyeShzdHJpbmdyKQpsaWJyYXJ5KHRtKQpsaWJyYXJ5KHRvcGljbW9kZWxzKQoKc2V0LnNlZWQoMTIzNCkKdGhlbWVfc2V0KHRoZW1lX21pbmltYWwoKSkKYGBgCgojIEltcG9ydCBkYXRhCgpgYGB7ciBpbXBvcnQtZGF0YX0KKGZyaWVuZHMgPC0gcmVhZF9jc3YoImRhdGEvZnJpZW5kcy10aWR5dGV4dC5jc3YiKSkKCihmcmllbmRzX2NvcmUgPC0gZnJpZW5kcyAlPiUKICBmaWx0ZXIoY2hhcmFjdGVyICVpbiUgYygiUmFjaGVsIiwgIk1vbmljYSIsICJQaG9lYmUiLCAiQ2hhbmRsZXIiLCAiSm9leSIsICJSb3NzIikpKQpgYGAKCiMgTW9zdCBjb21tb24gdG9rZW5zCgpgYGB7ciB3b3JkY2xvdWR9CmxpYnJhcnkod29yZGNsb3VkMikKbGlicmFyeShtYWdyaXR0cikKCmZyaWVuZHMgJT4lCiAgY291bnQod29yZCwgc29ydCA9IFRSVUUpICU+JQogIHJlbmFtZShmcmVxID0gbikgJVQ+JQogIHByaW50ICU+JQogIHdvcmRjbG91ZDIKYGBgCgojIE1vc3QgcHJlZG9taW5hbnQgdG9rZW5zIHBlciBtYWpvciBjaGFyYWN0ZXIKCmBgYHtyIHRmLWlkZn0KIyBnZXQgdGYtaWRmIHBlciBjaGFyYWN0ZXIKKGZyaWVuZHNfdGZpZGYgPC0gZnJpZW5kc19jb3JlICU+JQogICBjb3VudChjaGFyYWN0ZXIsIG5ncmFtLCB3b3JkKSAlPiUKICAgYmluZF90Zl9pZGYodGVybSA9IHdvcmQsIGRvY3VtZW50ID0gY2hhcmFjdGVyLCBuID0gbikpCmBgYAoKYGBge3IgdGZpZGYtcGxvdH0KIyBzb3J0IHRoZSBkYXRhIGZyYW1lIGFuZCBjb252ZXJ0IHdvcmQgdG8gYSBmYWN0b3IgY29sdW1uCmZyaWVuZHNfdGZpZGYgPC0gZnJpZW5kc190ZmlkZiAlPiUKICBhcnJhbmdlKGRlc2ModGZfaWRmKSkgJT4lCiAgbXV0YXRlKHdvcmQgPSBmYWN0b3Iod29yZCwgbGV2ZWxzID0gcmV2KHVuaXF1ZSh3b3JkKSkpKQoKIyBncmFwaCB0aGUgdG9wIDEwIHRva2VucyBmb3IgZWFjaCBjaGFyYWN0ZXIKZnJpZW5kc190ZmlkZiAlPiUKICBncm91cF9ieShjaGFyYWN0ZXIpICU+JQogIHRvcF9uKDIwKSAlPiUKICB1bmdyb3VwKCkgJT4lCiAgZ2dwbG90KGFlcyh3b3JkLCB0Zl9pZGYpKSArCiAgZ2VvbV9jb2woKSArCiAgbGFicyh4ID0gTlVMTCwgeSA9ICJ0Zi1pZGYiKSArCiAgZmFjZXRfd3JhcCh+IGNoYXJhY3Rlciwgc2NhbGVzID0gImZyZWUiKSArCiAgY29vcmRfZmxpcCgpCgojIHNlcGFyYXRlIGdyYXBocyBmb3IgZWFjaCBuZ3JhbQpwbG90X2ZyaWVuZHNfdGZpZGYgPC0gZnVuY3Rpb24oZnJpZW5kc190ZmlkZiwgbl9ncmFtLCBuX3RvcCA9IDEwKXsKICBmcmllbmRzX3RmaWRmICU+JQogICAgZmlsdGVyKG5ncmFtID09IG5fZ3JhbSkgJT4lCiAgICBncm91cF9ieShjaGFyYWN0ZXIpICU+JQogICAgdG9wX24obl90b3ApICU+JQogICAgdW5ncm91cCgpICU+JQogICAgZ2dwbG90KGFlcyh3b3JkLCB0Zl9pZGYpKSArCiAgICBnZW9tX2NvbCgpICsKICAgIGxhYnMoeCA9IE5VTEwsIHkgPSAidGYtaWRmIikgKwogICAgZmFjZXRfd3JhcCh+IGNoYXJhY3Rlciwgc2NhbGVzID0gImZyZWUiKSArCiAgICBjb29yZF9mbGlwKCkKfQoKc2VxKGZyb20gPSAxLCB0byA9IDUpICU+JQogIG1hcCh+IHBsb3RfZnJpZW5kc190ZmlkZihmcmllbmRzX3RmaWRmLCBuX2dyYW0gPSAueCkpCmBgYAoKIyBXb3JkIGNvLW9jY3VycmVuY2VzIGFuZCBjb3JyZWxhdGlvbnMKCmBgYHtyIHBhaXJ3aXNlLWNvdW50fQpmcmllbmRzX3N0b3Bfd29yZHMKCmZyaWVuZHNfcGFpcnMgPC0gZnJpZW5kcyAlPiUKICBmaWx0ZXIobmdyYW0gPT0gMSkgJT4lCiAgbXV0YXRlKGlkX3NjZW5lID0gc3RyX2Moc2Vhc29uLCBlcGlzb2RlLCBzY2VuZV9udW0sIHNlcCA9ICJfIikpICU+JQogIHBhaXJ3aXNlX2NvdW50KGl0ZW0gPSB3b3JkLCBmZWF0dXJlID0gaWRfc2NlbmUsIHNvcnQgPSBUUlVFLCB1cHBlciA9IEZBTFNFKQpmcmllbmRzX3BhaXJzCmBgYAoKYGBge3IgcGFpcndpc2UtcGxvdH0KbGlicmFyeShpZ3JhcGgpCmxpYnJhcnkoZ2dyYXBoKQoKc2V0LnNlZWQoMTIzNCkKCmZyaWVuZHNfcGFpcnMgJT4lCiAgZmlsdGVyKG4gPj0gMjUwKSAlPiUKICBncmFwaF9mcm9tX2RhdGFfZnJhbWUoKSAlPiUKICBnZ3JhcGgobGF5b3V0ID0gImZyIikgKwogIGdlb21fZWRnZV9saW5rKGFlcyhlZGdlX2FscGhhID0gbiwgZWRnZV93aWR0aCA9IG4pLCBlZGdlX2NvbG91ciA9ICJjeWFuNCIpICsKICBnZW9tX25vZGVfcG9pbnQoc2l6ZSA9IDUpICsKICBnZW9tX25vZGVfdGV4dChhZXMobGFiZWwgPSBuYW1lKSwgcmVwZWwgPSBUUlVFLCAKICAgICAgICAgICAgICAgICBwb2ludC5wYWRkaW5nID0gdW5pdCgwLjIsICJsaW5lcyIpKSArCiAgdGhlbWVfdm9pZCgpCmBgYAoKIyBFc3RpbWF0ZSBMREEgbW9kZWwKCiMjIFNjZW5lLWxldmVsIG1vZGVsCgojIyMgQ3JlYXRlIGRvY3VtZW50LXRlcm0gbWF0cml4CgpgYGB7ciBkdG19CiMgcmVtb3ZlIG5vbi1pbmZvcm1hdGl2ZSB0ZXJtcwojIyBnZXQgdG9rZW4gY291bnQtcGVyLXNjZW5lCmZyaWVuZHNfY291bnQgPC0gZnJpZW5kcyAlPiUKICBtdXRhdGUoaWQgPSBzdHJfYyhzZWFzb24sIGVwaXNvZGUsIHNjZW5lX251bSwgc2VwID0gIl8iKSkgJT4lCiAgY291bnQoaWQsIHNlYXNvbiwgZXBpc29kZSwgc2NlbmVfbnVtLCB3b3JkKQoKIyMgdG90YWwgbnVtYmVyIG9mIHNjZW5lcyBpbiBjb3JwdXMKbl9zY2VuZSA8LSBuX2Rpc3RpbmN0KGZyaWVuZHNfY291bnQkaWQpCgojIyBvbmx5IGtlZXAgdG9rZW5zIHRoYXQgYXBwZWFyIGluIGdyZWF0ZXIgdGhhbiAxJSBidXQgZmV3ZXIgdGhhbgojIyAyNSUgb2Ygc2NlbmVzLiBkZXRlcm1pbmVkIHZpYSB2aXN1YWwgaW5zcGVjdGlvbiBvZiBkaXN0cmlidXRpb24KZnJpZW5kc19rZWVwIDwtIGZyaWVuZHNfY291bnQgJT4lCiAgY291bnQod29yZCkgJT4lCiAgbXV0YXRlKHBjdCA9IG5uIC8gbl9zY2VuZSkgJT4lCiAgZmlsdGVyKHBjdCA8PSAuMjUsCiAgICAgICAgIHBjdCA+PSAuMDEpCgojIyBjcmVhdGUgdGhlIGRvY3VtZW50LXRlcm0gbWF0cml4CihmcmllbmRzX2R0bSA8LSBmcmllbmRzX2NvdW50ICU+JQogIHNlbWlfam9pbihmcmllbmRzX2tlZXApICU+JQogIGNhc3RfZHRtKGlkLCB3b3JkLCBuKSkKYGBgCgojIyMgUGljayBga2AKCmBgYHtyIGxkYS1rfQpsaWJyYXJ5KCJsZGF0dW5pbmciKQoKcmVzdWx0IDwtIEZpbmRUb3BpY3NOdW1iZXIoCiAgZnJpZW5kc19kdG0sCiAgdG9waWNzID0gc2VxKGZyb20gPSA1LCB0byA9IDUwLCBieSA9IDMpLAogIG1ldHJpY3MgPSBjKCJHcmlmZml0aHMyMDA0IiwgIkNhb0p1YW4yMDA5IiwgIkFydW4yMDEwIiwgIkRldmVhdWQyMDE0IiksCiAgY29udHJvbCA9IGxpc3Qoc2VlZCA9IDEyMzQpLAogIG1jLmNvcmVzID0gNkwsCiAgdmVyYm9zZSA9IFRSVUUKKQoKRmluZFRvcGljc051bWJlcl9wbG90KHJlc3VsdCkKYGBgCgojIyMgYEsgPSA1MGAKCmBgYHtyIGxkYS01MH0KbGRhXzUwIDwtIExEQShmcmllbmRzX2R0bSwgayA9IDUwLCBjb250cm9sID0gbGlzdChzZWVkID0gMTIzNCkpCmxkYV81MApgYGAKCmBgYHtyIGxkYXZpc30KbGlicmFyeShMREF2aXMpCmxpYnJhcnkoc2xhbSkKCmpzb24gPC0gY3JlYXRlSlNPTihwaGkgPSBwb3N0ZXJpb3IobGRhXzUwKSR0ZXJtcywKICAgICAgICAgICAgICAgICAgIHRoZXRhID0gcG9zdGVyaW9yKGxkYV81MCkkdG9waWNzLAogICAgICAgICAgICAgICAgICAgZG9jLmxlbmd0aCA9IHJvd19zdW1zKGZyaWVuZHNfZHRtKSwKICAgICAgICAgICAgICAgICAgIHZvY2FiID0gY29sbmFtZXMoZnJpZW5kc19kdG0pLAogICAgICAgICAgICAgICAgICAgdGVybS5mcmVxdWVuY3kgPSBjb2xfc3VtcyhmcmllbmRzX2R0bSkpCnNlclZpcyhqc29uKQpgYGAKCiMjIEVwaXNvZGUtbGV2ZWwgbW9kZWwKCiMjIyBDcmVhdGUgZG9jdW1lbnQtdGVybSBtYXRyaXgKCmBgYHtyIGR0bS1lcH0KIyByZW1vdmUgbm9uLWluZm9ybWF0aXZlIHRlcm1zCiMjIGdldCB0b2tlbiBjb3VudC1wZXItc2NlbmUKZnJpZW5kc19jb3VudF9lcCA8LSBmcmllbmRzICU+JQogIG11dGF0ZShpZCA9IHN0cl9jKHNlYXNvbiwgZXBpc29kZSwgc2VwID0gIl8iKSkgJT4lCiAgY291bnQoaWQsIHNlYXNvbiwgZXBpc29kZSwgd29yZCkKCiMjIHRvdGFsIG51bWJlciBvZiBzY2VuZXMgaW4gY29ycHVzCm5fZXAgPC0gbl9kaXN0aW5jdChmcmllbmRzX2NvdW50X2VwJGlkKQoKIyMgb25seSBrZWVwIHRva2VucyB0aGF0IGFwcGVhciBpbiBncmVhdGVyIHRoYW4gMSUgYnV0IGZld2VyIHRoYW4KIyMgMjUlIG9mIHNjZW5lcy4gZGV0ZXJtaW5lZCB2aWEgdmlzdWFsIGluc3BlY3Rpb24gb2YgZGlzdHJpYnV0aW9uCmZyaWVuZHNfa2VlcF9lcCA8LSBmcmllbmRzX2NvdW50X2VwICU+JQogIGNvdW50KHdvcmQpICU+JQogIG11dGF0ZShwY3QgPSBubiAvIG5fZXApICU+JQogIGZpbHRlcihwY3QgPD0gLjkzLAogICAgICAgICBwY3QgPj0gLjA1KQoKIyMgY3JlYXRlIHRoZSBkb2N1bWVudC10ZXJtIG1hdHJpeAooZnJpZW5kc19lcF9kdG0gPC0gZnJpZW5kc19jb3VudF9lcCAlPiUKICBzZW1pX2pvaW4oZnJpZW5kc19rZWVwX2VwKSAlPiUKICBjYXN0X2R0bShpZCwgd29yZCwgbikpCmBgYAoKIyMjIFBpY2sgYGtgCgpgYGB7ciBsZGEtay1lcH0KbGlicmFyeSgibGRhdHVuaW5nIikKCnJlc3VsdF9lcCA8LSBGaW5kVG9waWNzTnVtYmVyKAogIGZyaWVuZHNfZXBfZHRtLAogIHRvcGljcyA9IHNlcShmcm9tID0gNSwgdG8gPSA1MCwgYnkgPSAzKSwKICBtZXRyaWNzID0gYygiR3JpZmZpdGhzMjAwNCIsICJDYW9KdWFuMjAwOSIsICJBcnVuMjAxMCIsICJEZXZlYXVkMjAxNCIpLAogIGNvbnRyb2wgPSBsaXN0KHNlZWQgPSAxMjM0KSwKICBtYy5jb3JlcyA9IDZMLAogIHZlcmJvc2UgPSBUUlVFCikKCkZpbmRUb3BpY3NOdW1iZXJfcGxvdChyZXN1bHRfZXApCmBgYAoKIyMjIGBLID0gNTBgCgpgYGB7ciBsZGEtNTAtZXB9CmxkYV81MF9lcCA8LSBMREEoZnJpZW5kc19lcF9kdG0sIGsgPSA1MCwgY29udHJvbCA9IGxpc3Qoc2VlZCA9IDEyMzQpKQpsZGFfNTBfZXAKYGBgCgpgYGB7ciBsZGF2aXMtZXB9Cmpzb25fZXAgPC0gY3JlYXRlSlNPTihwaGkgPSBwb3N0ZXJpb3IobGRhXzUwX2VwKSR0ZXJtcywKICAgICAgICAgICAgICAgICAgIHRoZXRhID0gcG9zdGVyaW9yKGxkYV81MF9lcCkkdG9waWNzLAogICAgICAgICAgICAgICAgICAgZG9jLmxlbmd0aCA9IHJvd19zdW1zKGZyaWVuZHNfZXBfZHRtKSwKICAgICAgICAgICAgICAgICAgIHZvY2FiID0gY29sbmFtZXMoZnJpZW5kc19lcF9kdG0pLAogICAgICAgICAgICAgICAgICAgdGVybS5mcmVxdWVuY3kgPSBjb2xfc3VtcyhmcmllbmRzX2VwX2R0bSkpCnNlclZpcyhqc29uX2VwKQpgYGAKCgoKCgoKCgoKCgoK